#import the libraries
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from scipy.stats import zscore
from scipy.stats import iqr
import sklearn.metrics as metrics
import warnings
warnings.filterwarnings('ignore')
from sklearn.decomposition import PCA
# Read the data set (comma-separated vehicle silhouette features).
vehicle=pd.read_csv('vehicle.csv', sep = ',')
vehicle
# Transposed summary statistics for every attribute (numeric and categorical).
vehicle.describe(include="all").T
# compactness: mean and median are almost equal, which suggests it is roughly
# normally distributed with no obvious skewness/outliers.
# circularity: also looks normally distributed, as mean and median are similar.
# scatter_ratio: seems to have some skewness and outliers.
# scaled_variance and scaled_variance.1 look similar in this respect.
vehicle.dtypes
# Per this inspection: circularity, class, hollow_ratio,
# max.length_rectangularity, max.length_aspect_ratio and compactness have no
# missing values; the remaining features have some missing values.
# NOTE(review): circularity is nevertheless imputed later in the notebook —
# verify which columns actually contain NaN via the isnull() checks below.
# All attributes are numeric except 'class'.
vehicle.shape
vehicle.isnull().any()   # which columns contain at least one NaN
vehicle.nunique()        # cardinality of each column
vehicle.isnull().sum()   # missing-value count per column
# Treat the missing values by mean imputation.
# BUG FIX: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# the replacement is sklearn.impute.SimpleImputer (which has no `axis`
# argument — it always imputes column-wise).
from sklearn.impute import SimpleImputer

# Every column that needs imputation is numeric, so a single mean-strategy
# imputer over all of them reproduces the original 14 per-column calls.
cols_to_impute = [
    'circularity', 'distance_circularity', 'radius_ratio',
    'pr.axis_aspect_ratio', 'scatter_ratio', 'elongatedness',
    'pr.axis_rectangularity', 'scaled_variance', 'scaled_variance.1',
    'scaled_radius_of_gyration', 'scaled_radius_of_gyration.1',
    'skewness_about', 'skewness_about.1', 'skewness_about.2',
]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
vehicle[cols_to_impute] = imputer.fit_transform(vehicle[cols_to_impute])
vehicle.isnull().sum()
# Comparing with the original dataframe, the NaN values in the columns above
# have now been replaced with each column's mean (the original comment said
# "mode strategy", which was wrong — strategy='mean' is used).
# Summary boxplot view of all attributes to trace out outliers.
plt.figure(figsize=(20,20))
ax = sns.boxplot(data=vehicle, orient="h")
# pr.axis_aspect_ratio, skewness_about, max.length_aspect_ratio,
# skewness_about.1, scaled_radius_of_gyration.1, scaled_variance.1 and
# radius_ratio are among the attributes with outliers (visible as the
# individual dots beyond the whiskers).
# Treat outliers with the 1.5*IQR rule.
# BUG FIX: DataFrame.quantile() and the elementwise comparisons below fail on
# the non-numeric 'class' column in recent pandas, so restrict the rule to the
# numeric columns only (rows are still removed from the full frame).
numeric = vehicle.select_dtypes(include=np.number)
Q1 = numeric.quantile(0.25)
Q3 = numeric.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
outlier_rows = ((numeric < (Q1 - 1.5 * IQR)) | (numeric > (Q3 + 1.5 * IQR))).any(axis=1)
vehicle_clean = vehicle[~outlier_rows]
vehicle_clean.shape
# The rows containing outliers have been removed. Since the number of outliers
# was small we opted to drop them; generally we avoid this, as it can lead to
# information loss on large data sets with many outliers.
# Drop the 'class' column and inspect the correlation matrix before PCA —
# PCA should only be performed on the independent (numeric) attributes.
# BUG FIX: the original re-derived vehicle_clean from `vehicle`, silently
# discarding the outlier removal performed above; drop the column from the
# cleaned frame instead. Correlation and covariance are likewise computed on
# the class-free numeric frame (DataFrame.corr() and np.cov raise on the
# object-typed 'class' column in recent pandas/numpy).
vehicle_clean = vehicle_clean.drop('class', axis=1)
corr = vehicle_clean.corr()
fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(111)
cax = ax.matshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = np.arange(0, len(vehicle_clean.columns), 1)
ax.set_xticks(ticks)
plt.xticks(rotation=90)
ax.set_yticks(ticks)
ax.set_xticklabels(vehicle_clean.columns)
ax.set_yticklabels(vehicle_clean.columns)
plt.show()
covMatrix = np.cov(vehicle_clean, rowvar=False)
print(covMatrix)
# Strong/fair correlation:
# - scaled_variance & scaled_variance.1 appear strongly correlated (coeff ~0.98)
# - skewness_about.2 and hollow_ratio appear strongly correlated (coeff ~0.89)
# - distance_circularity and radius_ratio show a high positive correlation (coeff ~0.81)
# - compactness & circularity, and radius_ratio & pr.axis_aspect_ratio, are moderately correlated (coeff ~0.67)
# - scaled_variance & scaled_radius_of_gyration, and circularity & distance_circularity, also appear highly correlated (coeff ~0.79)
# - pr.axis_rectangularity and max.length_rectangularity appear strongly correlated (coeff ~0.81)
# - scatter_ratio and elongatedness show a strong negative correlation (coeff ~ -0.97)
# - elongatedness and pr.axis_rectangularity show a strong negative correlation (coeff ~ -0.95)
# Little to no correlation:
# - max.length_aspect_ratio & radius_ratio show an average correlation (coeff ~0.5)
# - pr.axis_aspect_ratio & max.length_aspect_ratio appear to have very little correlation
# - scaled_radius_of_gyration & scaled_radius_of_gyration.1 appear to be very little correlated
# - scaled_radius_of_gyration.1 & skewness_about appear to be very little correlated
# - skewness_about & skewness_about.1 appear not to be correlated
# - skewness_about.1 and skewness_about.2 are not correlated
sns.pairplot(vehicle)
plt.show()
# The pairplot validates the correlation heatmap: scaled_variance &
# scaled_variance.1 are very strongly positively correlated (~0.98),
# skewness_about.2 & hollow_ratio strongly positive (~0.89), while
# scatter_ratio & elongatedness and elongatedness & pr.axis_rectangularity are
# strongly negatively correlated — these need to be dropped or treated
# carefully before model building.
# Most attributes look roughly normally distributed; scaled_variance.1,
# skewness_about.1/.2 and scatter_ratio appear right-skewed, and
# pr.axis_rectangularity shows gaps in the plot suggestive of outliers.
# Several features correlate at roughly +/-0.9 or above, so the eight columns
# below are dropped to remove that redundancy.
redundant_columns = [
    'max.length_rectangularity',
    'scaled_radius_of_gyration',
    'skewness_about.2',
    'scatter_ratio',
    'elongatedness',
    'pr.axis_rectangularity',
    'scaled_variance',
    'scaled_variance.1',
]
vehicle = vehicle.drop(redundant_columns, axis=1)
# Integer-encode every remaining string-typed column (only 'class' here).
for col in vehicle.select_dtypes(include='object').columns:
    vehicle[col] = pd.Categorical(vehicle[col]).codes
# Class distribution after encoding.
vehicle.groupby("class").count()
# Split the data into train and test sets.
X = vehicle.drop("class" , axis=1)   # independent attributes
y = vehicle["class"]                 # integer-encoded target label
test_size = 0.30 # 70:30 train/test split
seed = 100 # random-number seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
from sklearn.svm import SVC
# Build a baseline Support Vector Machine on the (unscaled) train data.
# NOTE(review): gamma=1 on unscaled features is an aggressive choice — confirm intended.
svc_model = SVC(C= 1, kernel='rbf', gamma= 1)
svc_model.fit(X_train, y_train)
y_pred = svc_model.predict(X_test)
# Evaluate the baseline model on the held-out test set.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(cnf_matrix)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
from sklearn.model_selection import KFold # import KFold
kf = KFold(n_splits=10) # define the split - into 10 folds
kf.get_n_splits(X) # returns the number of splitting iterations in the cross-validator
# NOTE(review): kf is created and printed here but never actually used for
# cross-validation anywhere below.
print(kf)
# Standardise the independent attributes (zero mean, unit variance) before PCA.
from scipy.stats import zscore
vehicle=X.apply(zscore)  # NOTE: rebinds `vehicle` to the scaled feature frame
vehicle.head()
# Covariance matrix of the standardised features (equals the correlation matrix).
covMatrix = np.cov(vehicle,rowvar=False)
print(covMatrix)
# Fit PCA with all 10 remaining components to inspect the explained variance.
pca = PCA(n_components=10)
pca.fit(vehicle)
print(pca.explained_variance_)        # eigenvalues
print(pca.components_)                # eigenvectors (loadings)
print(pca.explained_variance_ratio_)  # fraction of variance per component
# Scree plot: variance explained per principal component.
plt.bar(list(range(0,10)),pca.explained_variance_ratio_,alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
# Cumulative variance explained — used to pick the number of components.
plt.step(list(range(0,10)),np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
# Eight dimensions look reasonable: with 8 components over 95% of the original
# variance is retained. Project the scaled data and retrain the SVM on it.
pca3 = PCA(n_components=8)
Xpca3 = pca3.fit_transform(vehicle)
print(pca3.components_)
print(pca3.explained_variance_ratio_)
Xpca3
sns.pairplot(pd.DataFrame(Xpca3))
plt.show()
# Same 70:30 split and seed as the baseline model so results are comparable.
split = train_test_split(Xpca3, y, test_size=0.3, random_state=100)
Xpca3_train, Xpca3_test, y_train, y_test = split
# Build a Support Vector Machine on the PCA-projected train data.
Xpca3_svc_model = SVC(C=1, kernel='rbf', gamma=1)
Xpca3_y_pred = Xpca3_svc_model.fit(Xpca3_train, y_train).predict(Xpca3_test)
cnf_matrix = metrics.confusion_matrix(y_test, Xpca3_y_pred)
print(cnf_matrix)
print("Accuracy:",metrics.accuracy_score(y_test, Xpca3_y_pred))
print(metrics.classification_report(y_test, Xpca3_y_pred))
# Side-by-side accuracy: baseline SVM vs PCA-based SVM.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Accuracy:",metrics.accuracy_score(y_test, Xpca3_y_pred))
# After applying PCA and reducing the dimensions the model accuracy rose to ~84%.
# Repeat the experiment with only 4 principal components to see how much
# accuracy a more aggressive reduction costs.
pca3_1 = PCA(n_components=4)
reduced = pca3_1.fit(vehicle).transform(vehicle)
Xpca3_1 = reduced
print(pca3_1.components_)
print(pca3_1.explained_variance_ratio_)
Xpca3_1
sns.pairplot(pd.DataFrame(Xpca3_1))
plt.show()
# Identical split parameters as before, for a fair comparison.
Xpca3_1_train, Xpca3_1_test, y_train, y_test = train_test_split(
    Xpca3_1, y, test_size=0.3, random_state=100)
# Build a Support Vector Machine on the 4-component PCA train data.
Xpca3_1_svc_model = SVC(C=1, kernel='rbf', gamma=1)
Xpca3_1_svc_model.fit(Xpca3_1_train, y_train)
Xpca3_1_y_pred = Xpca3_1_svc_model.predict(Xpca3_1_test)
cnf_matrix = metrics.confusion_matrix(y_test, Xpca3_1_y_pred)
print(cnf_matrix)
print("Accuracy:",metrics.accuracy_score(y_test, Xpca3_1_y_pred))
print(metrics.classification_report(y_test, Xpca3_1_y_pred))
# Side-by-side accuracy: baseline SVM vs 4-component PCA SVM.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Accuracy:",metrics.accuracy_score(y_test, Xpca3_1_y_pred))
# Reducing to 4 dimensions yields ~77.5% accuracy — lower than with 8 components.